O objeto principal da análise são as buscas e a navegação depois da busca. Criamos esses dados a partir dos dados originais da Wikimedia em /data/search_data.csv.

Aqui, exploramos esses dados.

library(tidyverse)
package ‘tidyverse’ was built under R version 3.4.4
-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 2.2.1     v purrr   0.2.4
v tibble  1.4.2     v dplyr   0.7.4
v tidyr   0.7.2     v stringr 1.2.0
v readr   1.1.1     v forcats 0.2.0
package ‘ggplot2’ was built under R version 3.4.4
package ‘tibble’ was built under R version 3.4.4
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
library(here)
package ‘here’ was built under R version 3.4.4
here() starts at C:/Users/marcosasn/Documents/gitlocal/lab2-cp4-marcosasn
library(lubridate)

Attaching package: ‘lubridate’

The following object is masked from ‘package:here’:

    here

The following object is masked from ‘package:base’:

    date
library(shiny)
library(plotly)

Attaching package: ‘plotly’

The following object is masked from ‘package:ggplot2’:

    last_plot

The following object is masked from ‘package:stats’:

    filter

The following object is masked from ‘package:graphics’:

    layout
# Use the black-and-white ggplot2 theme as the default for every plot below.
theme_set(theme_bw())
# Read the search data (path resolved from the project root by here::here)
# and keep only the first 100000 rows.
buscas <- head(read_csv(here::here("data/search_data.csv")), n = 100000)
Parsed with column specification:
cols(
  session_id = col_character(),
  search_index = col_integer(),
  session_start_timestamp = col_double(),
  session_start_date = col_datetime(format = ""),
  group = col_character(),
  results = col_integer(),
  num_clicks = col_integer(),
  first_click = col_integer()
)
# Histogram of the `results` column of `buscas`, using bins of width 5.
ggplot(buscas, aes(x = results)) +
  geom_histogram(binwidth = 5)

# search_index: number of searches in the session
# Question: What is our daily overall clickthrough rate? How does it vary
# between the groups?
# Columns noted for this question: num_clicks, group, session_start_date

# Bar chart with the number of rows of `buscas` in each group.
plot <- buscas %>%
  count(group) %>%
  ggplot(aes(x = group, y = n)) +
  geom_col(
    # `text` is not a ggplot2 aesthetic (ggplot2 reports it as unknown); it is
    # mapped only so that ggplotly(tooltip = "text") can show it on hover.
    aes(text = paste("Grupo:", group, "<br>", "Frequência:", n)),
    fill = "white",
    color = "blue"
  ) +
  labs(
    title = "Distribuição da frequência do grupo da sessão",
    x = "Grupo",
    y = "Frequência"
  ) +
  # Center the title and hide any legend.
  theme(plot.title = element_text(hjust = 0.5), legend.position = "none")
Ignoring unknown aesthetics: text
# Convert `plot` to a 700x400 interactive plotly widget whose hover box shows
# only the `text` aesthetic, and wrap it in a centered HTML div.
div(
  ggplotly(plot, tooltip = "text", width = 700, height = 400),
  align = "center"
)
We recommend that you use the dev version of ggplot2 with `ggplotly()`
Install it with: `devtools::install_github('hadley/ggplot2')`

# Jittered scatter of search_index for each group.
# Fix: the y aesthetic is now search_index. The title, the y-axis label and the
# tooltip all describe the search index, but the previous version plotted
# num_clicks under those labels.
plot <- buscas %>%
  ggplot(aes(x = group, y = search_index)) +
  geom_jitter(
    # `text` is not a ggplot2 aesthetic (ggplot2 reports it as unknown); it is
    # mapped only so that ggplotly(tooltip = "text") can show it on hover.
    aes(text = paste("Grupo:", group,
                     "<br>Quantidade índices da busca:", search_index)),
    # Small, semi-transparent points with horizontal jitter to reduce
    # overplotting within each group.
    alpha = .4, width = .2, size = .8, color = "blue"
  ) +
  ggtitle("Distribuição da quantidade de índices da busca") +
  xlab("Grupo") +
  ylab("Quantidade índices da busca") +
  # Center the title and hide any legend.
  theme(plot.title = element_text(hjust = 0.5), legend.position = "none")
Ignoring unknown aesthetics: text
# Convert `plot` to a 700x400 interactive plotly widget whose hover box shows
# only the `text` aesthetic, and wrap it in a centered HTML div.
div(
  ggplotly(plot, tooltip = "text", width = 700, height = 400),
  align = "center"
)
We recommend that you use the dev version of ggplot2 with `ggplotly()`
Install it with: `devtools::install_github('hadley/ggplot2')`

# Jittered scatter of search_index for each group, on a log10 y scale.
# Fix: the y aesthetic is now search_index so the data agrees with the title,
# the y-axis label and the tooltip. The previous num_clicks mapping was
# mislabelled and also produced infinite values under the log10 transform.
# NOTE(review): assumes search_index is always >= 1, so log10 stays finite —
# confirm against the data.
plot <- buscas %>%
  ggplot(aes(x = group, y = search_index)) +
  geom_jitter(
    # `text` is not a ggplot2 aesthetic (ggplot2 reports it as unknown); it is
    # mapped only so that ggplotly(tooltip = "text") can show it on hover.
    aes(text = paste("Grupo:", group,
                     "<br>Quantidade índices da busca:", search_index)),
    alpha = .4, width = .2, size = .8, color = "blue"
  ) +
  scale_y_log10() +
  ggtitle("Distribuição da quantidade de índices da busca") +
  xlab("Grupo") +
  ylab("Quantidade índices da busca") +
  # Center the title and hide any legend.
  theme(plot.title = element_text(hjust = 0.5), legend.position = "none")
Ignoring unknown aesthetics: text
# Convert `plot` to a 700x400 interactive plotly widget whose hover box shows
# only the `text` aesthetic, and wrap it in a centered HTML div.
div(
  ggplotly(plot, tooltip = "text", width = 700, height = 400),
  align = "center"
)
We recommend that you use the dev version of ggplot2 with `ggplotly()`
Install it with: `devtools::install_github('hadley/ggplot2')`
Transformation introduced infinite values in continuous y-axis

# Histogram of search_index (bins of width 5), one facet row per group.
# Fix 1: the x aesthetic is now search_index. The title and the x-axis label
# describe the search index, but the previous version plotted num_clicks.
plot <- buscas %>%
  ggplot(aes(x = search_index)) +
  geom_histogram(binwidth = 5, fill = "white", color = "blue") +
  facet_grid(group ~ .) +
  ggtitle("Distribuição da frequência da quantidade de índices da busca") +
  xlab("Quantidade índices da busca") +
  ylab("Frequência") +
  # Center the title and hide any legend.
  theme(plot.title = element_text(hjust = 0.5), legend.position = "none")
# Fix 2: this plot maps no `text` aesthetic, so the previous tooltip = "text"
# restriction selected an aesthetic that does not exist here. Use ggplotly's
# default tooltip so the hover box shows the aesthetics the histogram does map.
div(
  ggplotly(plot, width = 700, height = 400),
  align = "center"
)
We recommend that you use the dev version of ggplot2 with `ggplotly()`
Install it with: `devtools::install_github('hadley/ggplot2')`
LS0tDQp0aXRsZTogIkVEQSBidXNjYXMiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpPIG9iamV0byBwcmluY2lwYWwgZGEgYW7DoWxpc2Ugc8OjbyBhcyBidXNjYXMgZSBhIG5hdmVnYcOnw6NvIGRlcG9pcyBkYSBidXNjYS4gQ3JpYW1vcyBlc3NlcyBkYWRvcyBhIHBhcnRpciBkb3MgZGFkb3Mgb3JpZ2luYWlzIGRhIHdpa2ltZWRpYSBlbSBgL2RhdGEvc2VhcmNoX2RhdGEuY3N2YC4gDQoNCkFxdWksIGV4cGxvcmFtb3MgZXNzZXMgZGFkb3MuIA0KDQpgYGB7ciBzZXR1cH0NCmxpYnJhcnkodGlkeXZlcnNlKQ0KbGlicmFyeShoZXJlKQ0KbGlicmFyeShsdWJyaWRhdGUpDQpsaWJyYXJ5KHNoaW55KQ0KbGlicmFyeShwbG90bHkpDQp0aGVtZV9zZXQodGhlbWVfYncoKSkNCmBgYA0KDQpgYGB7ciBFVEx9DQpidXNjYXMgPSByZWFkX2NzdihoZXJlOjpoZXJlKCJkYXRhL3NlYXJjaF9kYXRhLmNzdiIpKSAlPiUNCiAgICBoZWFkKDEwMDAwMCkNCmBgYA0KDQpgYGB7cn0NCmJ1c2NhcyAlPiUgDQogICAgZ2dwbG90KGFlcyh4ID0gcmVzdWx0cykpICsgDQogICAgZ2VvbV9oaXN0b2dyYW0oYmlud2lkdGggPSA1KSANCmBgYA0KDQpgYGB7cn0NCiNzZWFyY2hfaW5kZXggcXVhbnRpZGFkZSBkZSBidXNjYXMgbmEgc2Vzc8Ojbw0KDQoNCiNXaGF0IGlzIG91ciBkYWlseSBvdmVyYWxsIGNsaWNrdGhyb3VnaCByYXRlPyBIb3cgZG9lcyBpdCB2YXJ5IGJldHdlZW4gdGhlIGdyb3Vwcz8NCiNudW1fY2xpY2tzDQojZ3JvdXANCiNzZXNzaW9uX3N0YXJ0X2RhdGUNCg0KcGxvdCA9IGJ1c2NhcyAlPiUgDQogICAgZ3JvdXBfYnkoZ3JvdXApICU+JSANCiAgICBzdW1tYXJpc2UobiA9IG4oKSkgJT4lIA0KICAgIGdncGxvdChhZXMoeCA9IGdyb3VwLCB5ID0gbikpICsgDQogICAgZ2VvbV9jb2woDQogICAgICAgIGFlcyh0ZXh0ID0gcGFzdGUoIkdydXBvOiIsIGdyb3VwLA0KICAgICAgICAgICAgICAgICAgICAgICAgICI8YnI+IiwNCiAgICAgICAgICAgICAgICAgICAgICAgICAiRnJlcXXDqm5jaWE6IiwgbikpLA0KICAgICAgICBmaWxsID0gIndoaXRlIiwgY29sb3IgPSAiYmx1ZSIpICsNCiAgICBnZ3RpdGxlKCJEaXN0cmlidWnDp8OjbyBkYSBmcmVxdcOqbmNpYSBkbyBncnVwbyBkYSBzZXNzw6NvIikgKw0KICAgIHhsYWIoIkdydXBvIikgKyANCiAgICB5bGFiKCJGcmVxdcOqbmNpYSIpICsNCiAgdGhlbWUocGxvdC50aXRsZSA9IGVsZW1lbnRfdGV4dChoanVzdCA9IDAuNSksIGxlZ2VuZC5wb3NpdGlvbj0ibm9uZSIpDQoNCmRpdihnZ3Bsb3RseShwbG90LCB0b29sdGlwID0gInRleHQiLCB3aWR0aCA9IDcwMCwgaGVpZ2h0ID0gNDAwKSwgIGFsaWduID0gImNlbnRlciIpDQoNCnBsb3QgPSBidXNjYXMgJT4lIA0KICAgIGdncGxvdChhZXMoeCA9IGdyb3VwLCB5ID0gbnVtX2NsaWNrcykpICsgDQogICAgZ2VvbV9qaXR0ZXIoYWVzKHRleHQgPSBwYXN0ZSgiR3J1cG86Iixncm91cCwNCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICI8
YnI+UXVhbnRpZGFkZSDDrW5kaWNlcyBkYSBidXNjYToiLHNlYXJjaF9pbmRleCkpLA0KICAgICAgICAgICAgICAgIGFscGhhID0gLjQsIHdpZHRoID0gLjIsIHNpemUgPSAuOCwgY29sb3IgPSAiYmx1ZSIpICsNCiAgICBnZ3RpdGxlKCJEaXN0cmlidWnDp8OjbyBkYSBxdWFudGlkYWRlIGRlIMOtbmRpY2VzIGRhIGJ1c2NhIikgKw0KICAgIHhsYWIoIkdydXBvIikgKyANCiAgICB5bGFiKCJRdWFudGlkYWRlIMOtbmRpY2VzIGRhIGJ1c2NhIikgKw0KICB0aGVtZShwbG90LnRpdGxlID0gZWxlbWVudF90ZXh0KGhqdXN0ID0gMC41KSwgbGVnZW5kLnBvc2l0aW9uPSJub25lIikNCg0KZGl2KGdncGxvdGx5KHBsb3QsIHRvb2x0aXAgPSAidGV4dCIsIHdpZHRoID0gNzAwLCBoZWlnaHQgPSA0MDApLCAgYWxpZ24gPSAiY2VudGVyIikNCg0KcGxvdCA9IGJ1c2NhcyAlPiUgDQogICAgZ2dwbG90KGFlcyh4ID0gZ3JvdXAsIHkgPSBudW1fY2xpY2tzKSkgKyANCiAgICBnZW9tX2ppdHRlcihhZXModGV4dCA9IHBhc3RlKCJHcnVwbzoiLGdyb3VwLA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIjxicj5RdWFudGlkYWRlIMOtbmRpY2VzIGRhIGJ1c2NhOiIsc2VhcmNoX2luZGV4KSksDQogICAgICAgICAgICAgICAgYWxwaGEgPSAuNCwgd2lkdGggPSAuMiwgc2l6ZSA9IC44LCBjb2xvciA9ICJibHVlIikgKw0KICAgIHNjYWxlX3lfbG9nMTAoKSArDQogICAgZ2d0aXRsZSgiRGlzdHJpYnVpw6fDo28gZGEgcXVhbnRpZGFkZSBkZSDDrW5kaWNlcyBkYSBidXNjYSIpICsNCiAgICB4bGFiKCJHcnVwbyIpICsgDQogICAgeWxhYigiUXVhbnRpZGFkZSDDrW5kaWNlcyBkYSBidXNjYSIpICsNCiAgdGhlbWUocGxvdC50aXRsZSA9IGVsZW1lbnRfdGV4dChoanVzdCA9IDAuNSksIGxlZ2VuZC5wb3NpdGlvbj0ibm9uZSIpDQoNCmRpdihnZ3Bsb3RseShwbG90LCB0b29sdGlwID0gInRleHQiLCB3aWR0aCA9IDcwMCwgaGVpZ2h0ID0gNDAwKSwgIGFsaWduID0gImNlbnRlciIpDQoNCnBsb3QgPSBidXNjYXMgJT4lIA0KICAgIGdncGxvdChhZXMoeD0gbnVtX2NsaWNrcykpICsgDQogICAgZ2VvbV9oaXN0b2dyYW0oYmlud2lkdGggPSA1LCBmaWxsID0gIndoaXRlIiwgY29sb3IgPSAiYmx1ZSIpICsgDQogICAgZmFjZXRfZ3JpZChncm91cCB+IC4pICsNCiAgICBnZ3RpdGxlKCJEaXN0cmlidWnDp8OjbyBkYSBmcmVxdcOqbmNpYSBkYSBxdWFudGlkYWRlIGRlIMOtbmRpY2VzIGRhIGJ1c2NhIikgKw0KICAgIHhsYWIoIlF1YW50aWRhZGUgw61uZGljZXMgZGEgYnVzY2EiKSArIA0KICAgIHlsYWIoIkZyZXF1w6puY2lhIikgKw0KICB0aGVtZShwbG90LnRpdGxlID0gZWxlbWVudF90ZXh0KGhqdXN0ID0gMC41KSwgbGVnZW5kLnBvc2l0aW9uPSJub25lIikNCg0KZGl2KGdncGxvdGx5KHBsb3QsIHRvb2x0aXAgPSAidGV4dCIsIHdpZHRoID0gNzAwLCBoZWlnaHQgPSA0MDApLCAgYWxpZ24gPSAiY2VudGVyIikNCmBgYA0KDQpgYGB7cn0NCg0KYGBgDQoN
Cg==